{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### In this notebook we compare the timing of encoding operations:\n",
    "- count encoding.\n",
    "- target encoding.\n",
    "- label encoding."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "# Restrict the CUDA devices visible to this process to a single index. This cell runs\n",
    "# before the cell that imports cudf/cupy, so the variable is set when they are loaded.\n",
    "# NOTE(review): the device index is hard-coded and presumably machine-specific -- confirm.\n",
    "GPU_id = 6\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import cudf as gd\n",
    "import cupy as cp\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import time\n",
    "import nvstrings\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Global"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Timing bookkeeping shared by the cells below.\n",
    "# GPU_RUN_TIME is not written by any cell visible in this notebook.\n",
    "GPU_RUN_TIME = {}\n",
    "# CPU_RUN_TIME[step] holds the elapsed wall-clock seconds (time.time() difference) of a step.\n",
    "CPU_RUN_TIME = {}\n",
    "# Step names, in the order in which they were registered.\n",
    "STEPS = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def on_gpu(words,func,arg=None,dtype=np.int32):\n",
    "    \"\"\"Call a method of ``words`` that writes its result into a GPU buffer.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    words : object exposing ``size()`` and the method named by ``func``\n",
    "        Presumably an ``nvstrings`` instance -- confirm against the callers.\n",
    "    func : str\n",
    "        Name of the method to call; it must accept a ``devptr`` keyword.\n",
    "    arg : object, optional\n",
    "        Forwarded as the single positional argument when it is not None.\n",
    "    dtype : numpy dtype\n",
    "        Element type of the output buffer.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    cupy.ndarray\n",
    "        1-D array with ``words.size()`` elements, whose memory was handed to the\n",
    "        method as ``devptr`` (the method is expected to fill it).\n",
    "    \"\"\"\n",
    "    # Allocate one output slot per string. cp.array(n) would instead create a 0-d\n",
    "    # array holding the number n, which is too small for the method to write into.\n",
    "    res = cp.empty(words.size(), dtype=dtype)\n",
    "    # CuPy exposes the raw device address as .data.ptr.\n",
    "    devptr = res.data.ptr\n",
    "    # Look the method up by name instead of building a string for eval().\n",
    "    method = getattr(words, func)\n",
    "    if arg is None:\n",
    "        method(devptr=devptr)\n",
    "    else:\n",
    "        method(arg, devptr=devptr)\n",
    "    return res\n",
    "\n",
    "def count_items(data,cols):\n",
    "    dg = data.groupby(cols+['item_id'],\n",
    "            as_index=False).agg({'step':['count']})\n",
    "    if len(cols) == 0:\n",
    "        tag = 'global'\n",
    "    else:\n",
    "        tag = '_'.join(cols)\n",
    "    dg.columns = cols + ['item_id', 'count_item_%s'%tag]\n",
    "\n",
    "    if len(cols):\n",
    "        df = data.groupby(cols,\n",
    "            as_index=False).agg({'step':['count']})\n",
    "        df.columns = cols + ['count_item_%s_all'%tag]\n",
    "    \n",
    "        dg = dg.merge(df,on=cols,how='left')\n",
    "        dg['count_item_%s_norm'%tag] = dg['count_item_%s'%tag] / dg['count_item_%s_all'%tag]\n",
    "    \n",
    "        dg = dg.drop('count_item_%s_all'%tag,axis=1)\n",
    "        del df\n",
    "    return dg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "YCOL = 'target'\n",
    "def mtr(tr,cols,te):\n",
    "    cols = cols + ['item_id']    \n",
    "    dg = tr.groupby(cols,as_index=False).agg({YCOL:'mean'})\n",
    "    if len(cols):\n",
    "        mcol = 'mtr_%s'%('_'.join(cols))\n",
    "    else:\n",
    "        mcol = 'mtr_global'\n",
    "    dg.columns = cols+[mcol]\n",
    "    te = te.merge(dg,on=cols,how='left')\n",
    "    del dg\n",
    "    return te,mcol\n",
    "\n",
    "def mtr_encode(train,test,cols,n_folds=8):\n",
    "    \"\"\"Mean-target-encode ``cols`` out-of-fold for ``train`` and in full for ``test``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    train : pandas or cudf DataFrame\n",
    "        Needs the columns in ``cols`` plus 'item_id', 'row_id', 'all_row_id' and YCOL.\n",
    "    test : DataFrame\n",
    "        Needs the columns in ``cols`` plus 'item_id'.\n",
    "    cols : list of str\n",
    "        Grouping columns passed on to ``mtr``.\n",
    "    n_folds : int, default 8\n",
    "        Number of folds; a row belongs to fold ``row_id % n_folds``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (DataFrame, DataFrame, str)\n",
    "        ``train`` with the feature merged in on 'all_row_id', ``test`` with the feature\n",
    "        computed from all training rows, and the name of the feature column.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``n_folds`` is smaller than 1.\n",
    "    \"\"\"\n",
    "    if n_folds < 1:\n",
    "        raise ValueError('n_folds must be at least 1')\n",
    "    train_x = train[cols+['item_id','row_id','all_row_id',YCOL]]\n",
    "    # The fold assignment does not change between iterations, so compute it once.\n",
    "    fold_id = train_x['row_id']%n_folds\n",
    "    res = []\n",
    "    for i in range(n_folds):\n",
    "        # Encode the held-out fold with statistics from the other folds only.\n",
    "        tr = train_x[fold_id!=i]\n",
    "        va = train_x[fold_id==i]\n",
    "        va,mcol = mtr(tr,cols,va)\n",
    "        res.append(va)\n",
    "        print(cols,'fold',i,'done')\n",
    "        del tr,va\n",
    "    if isinstance(train,gd.DataFrame):\n",
    "        res = gd.concat(res)\n",
    "    else:\n",
    "        res = pd.concat(res)\n",
    "    # Keep only the join key and the new feature before merging back into train.\n",
    "    res = res[['all_row_id',mcol]]\n",
    "    train = train.merge(res,on='all_row_id',how='left')\n",
    "    del res\n",
    "    # The test rows are encoded with the statistics of the complete training set.\n",
    "    test,_ = mtr(train_x,cols,test)\n",
    "    del train_x\n",
    "    return train,test,mcol"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read pair data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the local cache directory; exist_ok makes this a no-op when it already exists.\n",
    "os.makedirs('cache', exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `path` is not referenced by any cell visible in this notebook, and the\n",
    "# parquet file below is read from 'cache/' rather than '../cache/' -- confirm which is intended.\n",
    "path = '../cache/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "step = 'prepare for target encoding'\n",
    "# Register the step only once: a duplicate entry (from re-running this cell) would be\n",
    "# counted twice when the per-step timings are summed.\n",
    "if step not in STEPS:\n",
    "    STEPS.append(step)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(42756036, 9) (5762533, 9)\n",
      "CPU times: user 30.2 s, sys: 17 s, total: 47.2 s\n",
      "Wall time: 27.2 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Load the pair data with pandas, split it into train and test, and time the whole cell.\n",
    "start = time.time()\n",
    "\n",
    "cols = ['user_id','item_id','clickout_missing','city','device','current_filters','row_id','target']\n",
    "data_pair_pd = pd.read_parquet('cache/data_pair.parquet')[cols]\n",
    "\n",
    "# Running row number over the complete frame; mtr_encode merges its out-of-fold result on it.\n",
    "data_pair_pd['all_row_id'] = np.arange(data_pair_pd.shape[0])\n",
    "cols.append('all_row_id')\n",
    "# Missing filters become the literal string 'None'.\n",
    "data_pair_pd['current_filters'] = data_pair_pd['current_filters'].fillna('None')\n",
    "\n",
    "# clickout_missing == 0 selects the training rows, clickout_missing > 0 the test rows.\n",
    "train_pair_pd = data_pair_pd[data_pair_pd['clickout_missing']==0]\n",
    "test_pair_pd = data_pair_pd[data_pair_pd['clickout_missing']>0]\n",
    "print(train_pair_pd.shape,test_pair_pd.shape)\n",
    "\n",
    "# `step` is the name assigned by the most recently executed step-registration cell.\n",
    "CPU_RUN_TIME[step] = time.time() - start"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### mean target encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "step = 'mean target encoding'\n",
    "# Register the step only once: a duplicate entry (from re-running this cell) would be\n",
    "# counted twice when the per-step timings are summed.\n",
    "if step not in STEPS:\n",
    "    STEPS.append(step)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### pandas (CPU) mean target encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In practice we also computed n-way mean target encoding as follows:\n",
    "- ['current_filters','city'],\n",
    "- ['current_filters','city','device'],\n",
    "- ['current_filters','device'],\n",
    "- ['current_filters','platform'],\n",
    "\n",
    "*These combinations are not included in this notebook because of memory capacity limits.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['current_filters'] fold 0 done\n",
      "['current_filters'] fold 1 done\n",
      "['current_filters'] fold 2 done\n",
      "['current_filters'] fold 3 done\n",
      "['current_filters'] fold 4 done\n",
      "['current_filters'] fold 5 done\n",
      "['current_filters'] fold 6 done\n",
      "['current_filters'] fold 7 done\n",
      "['city'] fold 0 done\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "start = time.time()\n",
    "\n",
    "# One mean-target feature per grouping column. Each call returns train and test with the\n",
    "# new column merged in, so the frames grow by one column per iteration.\n",
    "# Note that the loop variable rebinds the notebook-level name `cols`.\n",
    "for cols in [\n",
    "    ['current_filters'],\n",
    "    ['city'],\n",
    "    ['device'],\n",
    "    ['user_id'],\n",
    "]:\n",
    "    \n",
    "    train_pair_pd,test_pair_pd,mcol = mtr_encode(train_pair_pd,test_pair_pd,cols=cols)\n",
    "print('mtr done')\n",
    "\n",
    "# `step` is the name assigned by the most recently executed step-registration cell.\n",
    "CPU_RUN_TIME[step] = time.time() - start"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize the timing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add the 'Overall' entry only once and leave it out of its own sum, so re-running\n",
    "# this cell produces the same table instead of a growing total.\n",
    "if 'Overall' not in STEPS:\n",
    "    STEPS.append('Overall')\n",
    "CPU_RUN_TIME['Overall'] = sum(CPU_RUN_TIME[i] for i in STEPS if i != 'Overall')\n",
    "\n",
    "# One row per step with its CPU wall-clock time in seconds.\n",
    "timing = pd.DataFrame({\n",
    "    'step': STEPS,\n",
    "    'CPU': [CPU_RUN_TIME[i] for i in STEPS],\n",
    "})\n",
    "timing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
